R Package Dependencies


In [1]:
import pandas
import deps
import itertools
from matplotlib import pyplot as plt
from collections import OrderedDict

# Draw matplotlib figures inside the notebook instead of in separate windows.
%matplotlib inline
from IPython.display import set_matplotlib_formats
# Ask the inline backend to emit figures as PDF.
set_matplotlib_formats('pdf')

# Workaround if changes are made to deps.py and "Run All" is hit:
# re-import the local helper module so that edits to it are picked up without
# restarting the kernel (`reload` is the Python 2 builtin).
deps = reload(deps)

# Raw package metadata shared by the cells below. index_col=None presumably
# keeps every CSV column as data instead of promoting one to the index --
# TODO confirm against the pandas version in use.
data = pandas.DataFrame.from_csv('../data/github-cran-bioc-alldata.csv', index_col=None)
# Sources compared in the analysis; 'bioc' is currently commented out.
sources = ['github', 'cran']#, 'bioc']

In [2]:
# One dependency graph per monthly snapshot, keyed by the snapshot date and
# kept in chronological order.
snapshot_dates = pandas.date_range(start='2013-09', end='2015-01', freq='1M')
graphs = OrderedDict(
    (date, deps.create_graph_for(data, date)) for date in snapshot_dates
)

Evolution of the number of packages


In [3]:
# Count, for every monthly snapshot, the packages hosted on GitHub, on CRAN,
# and on both at once.
number = OrderedDict()
for snapshot, graph in graphs.iteritems():
    label = '{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)
    counts = {'github': 0, 'cran': 0, 'both': 0}
    for name, package in graph.iteritems():
        on_github = 'github' in package
        on_cran = 'cran' in package
        if on_github:
            counts['github'] += 1
        if on_cran:
            counts['cran'] += 1
            if on_github:
                counts['both'] += 1
    number[label] = counts

# One row per snapshot, ordered by date.
df_N = pandas.DataFrame.from_dict(number, orient='index')
df_N.index = pandas.to_datetime(df_N.index)
df_N = df_N.sort_index()

# Percentage of each source's packages that are available on both sources.
for source in ('github', 'cran'):
    df_N[source + 'P'] = 100. * df_N['both'] / df_N[source]

# Absolute counts go on the left y-axis ...
count_columns = ['github', 'cran', 'both']
ax = df_N[count_columns].plot(
    title=u'Number of available packages\n',
    figsize=(8,4),
    ylim=(0,9000))
ax.legend(['github', 'cran', 'github $\cap$ cran'], title='on left y-axis', ncol=1, loc='best')

# ... and the overlap percentages, dashed, on a twin right y-axis.
ax2 = df_N[['githubP', 'cranP']].plot(
    ax=ax.twinx(),
    ylim=(0,100),
    style=['--', '--'],
    legend=False,
    grid=False)
ax2.legend(['github', 'cran'], title='on right y-axis', ncol=2, loc='best')

ax2.set_yticklabels(['{}%'.format(int(tick)) for tick in ax2.get_yticks()])
df_N


Out[3]:
both cran github githubP cranP
2013-09-30 509 4852 1817 28.013209 10.490519
2013-10-31 548 4966 1980 27.676768 11.035038
2013-11-30 579 5062 2142 27.030812 11.438167
2013-12-31 605 5154 2276 26.581722 11.738456
2014-01-31 641 5274 2466 25.993512 12.153963
2014-02-28 671 5372 2654 25.282592 12.490692
2014-03-31 713 5489 2898 24.603175 12.989616
2014-04-30 759 5619 3126 24.280230 13.507742
2014-05-31 805 5742 3392 23.732311 14.019505
2014-06-30 844 5848 3634 23.225096 14.432285
2014-07-31 902 5973 3958 22.789288 15.101289
2014-08-31 958 6102 4283 22.367499 15.699771
2014-09-30 1004 6215 4609 21.783467 16.154465
2014-10-31 1045 6261 5011 20.854121 16.690625
2014-11-30 1121 6451 5341 20.988579 17.377151
2014-12-31 1183 6560 5658 20.908448 18.033537
<matplotlib.figure.Figure at 0x7f31f7ca1f50>

Required CRAN Packages


In [4]:
# For every monthly snapshot, collect the CRAN packages that are directly
# required by at least one package of each source.
#
#   required_share: date label -> {source: percentage of the snapshot's CRAN
#                   packages required by that source}
#   intersections:  date label -> sizes of the Venn regions of those sets
#
# The percentages live in their own dict: this cell does not rebind `data`,
# the package DataFrame that the graph-building cell reads.
required_share = {}
intersections = {}

for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    value = {k: set() for k in sources}
    n = 0  # CRAN packages in this snapshot: denominator of the percentages

    for name, package in graph.iteritems():
        if 'cran' in package:
            n += 1
        for source in package.iterkeys():
            if source not in sources:
                continue
            for dependency in package[source]['Dependencies']:
                # Ignore anything listed in deps.R_packages (presumably the
                # packages shipped with R itself -- confirm in deps.py).
                if dependency in deps.R_packages:
                    continue
                # Keep only dependencies this snapshot offers on CRAN.
                if dependency in graph and 'cran' in graph[dependency]:
                    value[source].add(dependency)

    github = value['github']
    cran = value['cran']
    # Empty unless 'bioc' is enabled in `sources`.
    bioc = value.get('bioc', set())

    # Absolute sizes: the two "overall" sets, then each exclusive Venn region.
    intersections[date] = OrderedDict([
        ('overall github', len(github)),
        ('overall cran', len(cran)),
        ('github', len(github.difference(cran).difference(bioc))),
        ('cran', len(cran.difference(github).difference(bioc))),
        ('bioc', len(bioc.difference(github).difference(cran))),
        ('github $\cap$ cran', len(github.intersection(cran).difference(bioc))),
        ('github $\cap$ bioc', len(github.intersection(bioc).difference(cran))),
        ('cran $\cap$ bioc', len(cran.intersection(bioc).difference(github))),
        ('github $\cap$ cran $\cap$ bioc', len(github.intersection(cran).intersection(bioc)))
    ])

    required_share[date] = OrderedDict()
    for source in sources:
        required_share[date][source] = len(value[source]) * 100.0 / n

df = pandas.DataFrame.from_dict(required_share, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()[['github', 'cran']]
ax = df.plot(title=u'Proportion of CRAN packages needed for packages from given source\n',
                 ylim=(10,30),
                 figsize=(8, 4))
ax.legend(ncol=2, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df


Out[4]:
github cran
2013-09-30 12.901896 24.422918
2013-10-31 13.592429 24.224728
2013-11-30 14.164362 24.022126
2013-12-31 14.784633 24.233605
2014-01-31 15.073948 24.061433
2014-02-28 15.543559 24.050633
2014-03-31 15.868100 24.029878
2014-04-30 16.177256 23.918847
2014-05-31 16.370603 23.859282
2014-06-30 16.603967 23.785910
2014-07-31 17.026620 24.024778
2014-08-31 17.420518 23.861029
2014-09-30 18.020917 23.716814
2014-10-31 18.559336 23.734228
2014-11-30 18.679275 23.732755
2014-12-31 19.085366 23.734756
<matplotlib.figure.Figure at 0x7f31f9a86d50>

In [5]:
# Sizes of the Venn regions stored in `intersections`, one row per snapshot,
# restricted to the regions that involve only GitHub and CRAN.
venn_columns = ['github', 'overall github', 'cran', 'overall cran', 'github $\cap$ cran']
legend_labels = ['only github', 'at least github', 'only cran', 'at least cran', 'both']

df = pandas.DataFrame.from_dict(intersections, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()[venn_columns]

ax = df.plot(
    title=u'Size of the Venn sets containing CRAN packages needed by given set of sources\n',
    style=['b', 'b--', 'g', 'g--', 'r'],
    ylim=(0,2100),
    figsize=(8, 4))
ax.legend(legend_labels, title='Required by', ncol=3, loc='best')

# Add, next to each absolute size, its percentage of the CRAN package count
# of the same snapshot (df_N['cran']).
for column in venn_columns:
    df[column + '%'] = 100.0 * df[column] / df_N['cran']
df


Out[5]:
github overall github cran overall cran github $\cap$ cran github% overall github% cran% overall cran% github $\cap$ cran%
2013-09-30 116 626 675 1185 510 2.390767 12.901896 13.911789 24.422918 10.511129
2013-10-31 136 675 664 1203 539 2.738623 13.592429 13.370922 24.224728 10.853806
2013-11-30 153 717 652 1216 564 3.022521 14.164362 12.880284 24.022126 11.141841
2013-12-31 174 762 661 1249 588 3.376019 14.784633 12.824990 24.233605 11.408615
2014-01-31 184 795 658 1269 611 3.488813 15.073948 12.476299 24.061433 11.585135
2014-02-28 191 835 648 1292 644 3.555473 15.543559 12.062547 24.050633 11.988086
2014-03-31 200 871 648 1319 671 3.643651 15.868100 11.805429 24.029878 12.224449
2014-04-30 216 909 651 1344 693 3.844100 16.177256 11.585691 23.918847 12.333155
2014-05-31 225 940 655 1370 715 3.918495 16.370603 11.407175 23.859282 12.452107
2014-06-30 237 971 657 1391 734 4.052668 16.603967 11.234610 23.785910 12.551300
2014-07-31 249 1017 667 1435 768 4.168759 17.026620 11.166918 24.024778 12.857860
2014-08-31 273 1063 666 1456 790 4.473943 17.420518 10.914454 23.861029 12.946575
2014-09-30 310 1120 664 1474 810 4.987932 18.020917 10.683829 23.716814 13.032985
2014-10-31 336 1162 660 1486 826 5.366555 18.559336 10.541447 23.734228 13.192781
2014-11-30 339 1205 665 1531 866 5.254999 18.679275 10.308479 23.732755 13.424275
2014-12-31 359 1252 664 1557 893 5.472561 19.085366 10.121951 23.734756 13.612805
<matplotlib.figure.Figure at 0x7f3240f9e8d0>

In [6]:
"""
data = OrderedDict()

combinations = [set()]
for n in range(len(sources)):
    combinations += [set(x) for x in itertools.combinations(sources, n+1)]

# Gather all packages from CRAN
cran = set()
for date, graph in graphs.iteritems():
    for name, package in graph.iteritems():
        if 'cran' in package:
            cran.add(name)

for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    data[date] = {k: set()for k in cran}
    
    for name, package in graph.iteritems():
        for source in package.iterkeys():
            dependencies = filter(lambda p: p not in deps.R_packages, package[source]['Dependencies'])
            # Filter dependencies to CRAN
            dependencies = filter(lambda p: p in graph and 'cran' in graph[p], dependencies)
            # Every dependency is required by current 'name' from 'source'
            for dependency in dependencies:
                data[date][dependency].add(source)   
    
    # Compute the index in "combinations" for every CRAN packages
    for name, source in data[date].iteritems():
        data[date][name] = combinations.index(source)
        
df = pandas.DataFrame.from_dict(data, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()

df = df.T
df = df.sort(columns=df.columns[-1])

import numpy as np
column_labels = df.columns
row_labels = df.index
fig, ax = plt.subplots(figsize=(20,15))
heatmap = ax.pcolor(df, cmap=plt.cm.Spectral)

cbar = plt.colorbar(heatmap)
cbar.ax.get_yaxis().set_ticks([])
for j, lab in enumerate(combinations):
    cbar.ax.text(.5, (2 * j + 1) / 16.0, lab, ha='center', va='center')
cbar.ax.get_yaxis().labelpad = 30


plt.show()        
"""
print



Packages with dependencies and at least one external dependency


In [7]:
# Among the packages of each source that declare at least one dependency
# outside deps.R_packages, compute the share that has at least one "external"
# dependency, i.e. one that is not available on that same source.
#
# The result is kept in its own dict: this cell does not rebind `data`, the
# package DataFrame that the graph-building cell reads.
external_share = {}

for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    # Per source: 'n' = packages with at least one dependency,
    #             'e' = those among them with an external dependency.
    value = {k: {'n':0, 'e': 0} for k in sources + ['bioc']}

    for name, package in graph.iteritems():
        for source in package.iterkeys():
            dependencies = [p for p in package[source]['Dependencies']
                            if p not in deps.R_packages]
            if not dependencies:
                continue
            value[source]['n'] += 1
            # External: unknown to the graph, or known but not on this source.
            if any(d not in graph or source not in graph[d] for d in dependencies):
                value[source]['e'] += 1

    external_share[date] = OrderedDict()
    for source in sources:
        external_share[date][source] = value[source]['e'] * 100.0 / value[source]['n']

df = pandas.DataFrame.from_dict(external_share, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()
ax = df[['github', 'cran']].plot(title=u'Proportion of packages in given source with at least one external dependency, \nconsidering packages with at least one dependency\n',
                 ylim=(0,100),
                 figsize=(8, 4))
ax.legend(ncol=3, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df



Out[7]:
github cran
2013-09-30 82.954545 4.727273
2013-10-31 82.515549 4.800565
2013-11-30 82.994924 4.756980
2013-12-31 83.423423 4.826190
2014-01-31 83.480663 4.744646
2014-02-28 83.865979 4.910141
2014-03-31 83.783784 4.900125
2014-04-30 84.312007 4.912068
2014-05-31 84.060334 4.840614
2014-06-30 83.485540 4.356607
2014-07-31 82.724719 4.405163
2014-08-31 82.249101 4.357358
2014-09-30 81.661109 4.394427
2014-10-31 80.829160 4.407860
2014-11-30 80.601307 4.431352
2014-12-31 80.118110 4.087262
<matplotlib.figure.Figure at 0x7f3240fb3090>

In [8]:
for date, graph in graphs.iteritems():
    github = [(name, package['github']) for name, package in graph.iteritems() if 'github' in package]
    unsatisfied_deps = 0
    cran_deps = 0
    github_deps = 0
    
    for name, package in github:
        dependencies = filter(lambda p: p not in deps.R_packages, package['Dependencies'])
        if len(dependencies) > 0:
            
            for dep in dependencies:
                github_deps += 1
                if dep not in graph or 'github' not in graph[dep]:
                    unsatisfied_deps += 1
                    if dep in graph and 'cran' in graph[dep]:
                        cran_deps += 1
        
    print date, len(github), github_deps, unsatisfied_deps, cran_deps, 100.0 * cran_deps / unsatisfied_deps


2013-09-30 00:00:00 1817 4708 2786 2332 83.704235463
2013-10-31 00:00:00 1980 5202 3062 2561 83.6381450033
2013-11-30 00:00:00 2142 5660 3331 2794 83.8787151006
2013-12-31 00:00:00 2276 6103 3609 3028 83.9013577168
2014-01-31 00:00:00 2466 6583 3886 3278 84.3540916109
2014-02-28 00:00:00 2654 7131 4199 3571 85.0440581091
2014-03-31 00:00:00 2898 7859 4617 3938 85.2934806151
2014-04-30 00:00:00 3126 8579 5035 4299 85.3823237339
2014-05-31 00:00:00 3392 9149 5341 4535 84.909193035
2014-06-30 00:00:00 3634 9777 5648 4783 84.6848441926
2014-07-31 00:00:00 3958 10602 6045 5113 84.582299421
2014-08-31 00:00:00 4283 11400 6374 5404 84.7819265767
2014-09-30 00:00:00 4609 12307 6789 5775 85.0640742377
2014-10-31 00:00:00 5011 13429 7291 6210 85.1735015773
2014-11-30 00:00:00 5341 14251 7703 6591 85.5640659483
2014-12-31 00:00:00 5658 15128 8107 6948 85.7037128408

Packages with no dependency


In [9]:
# For every snapshot and source, the percentage of packages that declare no
# dependency outside deps.R_packages ('0 on <source>') and the percentage
# that declare at least one ('>0 on <source>').
#
# The result is kept in its own dict: this cell does not rebind `data`, the
# package DataFrame that the graph-building cell reads.
no_dependency_share = {}

for date, graph in graphs.iteritems():
    date = '{}-{}-{}'.format(date.year, date.month, date.day)
    # Per source: 'n' = all packages, '0' = without dependency, '+' = with.
    value = {k: {'n':0, '0':0, '+': 0} for k in sources + ['bioc']}

    for name, package in graph.iteritems():
        for source in package.iterkeys():
            has_dependency = any(p not in deps.R_packages
                                 for p in package[source]['Dependencies'])
            value[source]['n'] += 1
            value[source]['+' if has_dependency else '0'] += 1

    no_dependency_share[date] = OrderedDict()
    for source in sources:
        no_dependency_share[date]['0 on {}'.format(source)] = value[source]['0'] * 100.0 / value[source]['n']
        no_dependency_share[date]['>0 on {}'.format(source)] = value[source]['+'] * 100.0 / value[source]['n']

df = pandas.DataFrame.from_dict(no_dependency_share, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()

ax = df[['0 on github', '0 on cran']].plot(title=u'Proportion of packages with no dependency\n',
                 ylim=(20,50),
                 figsize=(8, 4))
ax.legend(['github', 'cran'], ncol=2, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
df



Out[9]:
0 on github >0 on github 0 on cran >0 on cran
2013-09-30 27.352779 72.647221 43.322341 56.677659
2013-10-31 26.919192 73.080808 42.952074 57.047926
2013-11-30 26.423903 73.576097 42.690636 57.309364
2013-12-31 26.845343 73.154657 42.510671 57.489329
2014-01-31 26.601784 73.398216 42.453546 57.546454
2014-02-28 26.902788 73.097212 41.995532 58.004468
2014-03-31 27.225673 72.774327 41.628712 58.371288
2014-04-30 26.999360 73.000640 41.306282 58.693718
2014-05-31 27.682783 72.317217 40.996169 59.003831
2014-06-30 27.682994 72.317006 40.731874 59.268126
2014-07-31 28.044467 71.955533 40.331492 59.668508
2014-08-31 28.578099 71.421901 40.199934 59.800066
2014-09-30 28.422651 71.577349 39.951730 60.048270
2014-10-31 28.277789 71.722211 39.849864 60.150136
2014-11-30 28.384198 71.615802 39.482251 60.517749
2014-12-31 28.172499 71.827501 39.207317 60.792683
<matplotlib.figure.Figure at 0x7f321db15f50>

Sources needed for GitHub packages

We compute, for every GitHub package, the sources that are required to install the package.


In [10]:
# For every snapshot, classify each GitHub package by the repositories that
# are enough to satisfy all of its dependencies (outside deps.R_packages).
installable = OrderedDict()

for snapshot, graph in graphs.iteritems():
    label = '{}-{}-{}'.format(snapshot.year, snapshot.month, snapshot.day)
    tally = {'number': 0, 'empty set': 0, 'github': 0, 'cran': 0, 'github and cran': 0, 'other': 0}
    installable[label] = tally

    for name, package in graph.iteritems():
        if 'github' not in package:
            continue
        tally['number'] += 1

        required = [p for p in package['github']['Dependencies']
                    if p not in deps.R_packages]

        # A dependency missing from the graph has to come from another source.
        if any(p not in graph for p in required):
            tally['other'] += 1
            continue

        # The buckets are cumulative: a package without dependencies also
        # counts as satisfied by github, by cran and by their union.
        if not required:
            tally['empty set'] += 1
        if all('github' in graph[p] for p in required):
            tally['github'] += 1
        if all('cran' in graph[p] for p in required):
            tally['cran'] += 1
        # Every bucket above implies this one, so a package that fails it
        # matched none of them and needs something else.
        if all('github' in graph[p] or 'cran' in graph[p] for p in required):
            tally['github and cran'] += 1
        else:
            tally['other'] += 1

In [11]:
df = pandas.DataFrame.from_dict(installable, orient='index')
df.index = pandas.to_datetime(df.index)
df = df.sort_index()
for key in ['empty set', 'github', 'cran', 'github and cran']:
    df[key] = 100.0 * df[key] / df['number']

df = df[['empty set', 'github', 'cran', 'github and cran']]
ax = df.plot(title=u'% of github packages with dependencies satisfied using given repositories\n', 
             ylim=(0,100),
             style=['k--', 'b', 'g', 'r'],
             figsize=(8, 4))

ax.legend(['R core packages', 'github', 'cran', 'github$\cup$cran'], ncol=2, loc='best')
ax.set_yticklabels([str(int(v))+'%' for v in ax.get_yticks()])
print df


            empty set     github       cran  github and cran
2013-09-30  27.352779  39.735828  83.214089        88.277380
2013-10-31  26.919192  39.696970  82.323232        87.979798
2013-11-30  26.423903  38.935574  82.446312        88.281979
2013-12-31  26.845343  38.971880  82.732865        88.444640
2014-01-31  26.601784  38.726683  83.454988        88.888889
2014-02-28  26.902788  38.696307  83.911078        89.148455
2014-03-31  27.225673  39.026915  84.195997        89.475500
2014-04-30  26.999360  38.451695  83.589251        89.315419
2014-05-31  27.682783  39.209906  83.579009        89.268868
2014-06-30  27.682994  39.625757  83.736929        89.378096
2014-07-31  28.044467  40.474987  83.501769        89.565437
2014-08-31  28.578099  41.256129  83.539575        89.960308
2014-09-30  28.422651  41.549143  83.792580        90.128010
2014-10-31  28.277789  42.027539  83.636001        90.241469
2014-11-30  28.384198  42.276727  84.328777        90.563565
2014-12-31  28.172499  42.453164  84.429127        90.615058
<matplotlib.figure.Figure at 0x7f321db15d90>